library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.1     v purrr   0.3.4
## v tibble  3.0.0     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(ggplot2)
covid <- readRDS("data/covid_north_complete.RData")
fit = lm(new_cases_per_million~new_tests_per_million+as.factor(continent),data = covid)
summary(fit)
## 
## Call:
## lm(formula = new_cases_per_million ~ new_tests_per_million + 
##     as.factor(continent), data = covid)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -587.04  -55.82  -16.14   12.19 1364.72 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       17.1612093  4.0415175   4.246 2.21e-05 ***
## new_tests_per_million              0.0201693  0.0007605  26.522  < 2e-16 ***
## as.factor(continent)Asia          -1.6806933  5.4900207  -0.306     0.76    
## as.factor(continent)Europe        95.7821128  6.7401297  14.211  < 2e-16 ***
## as.factor(continent)North America 79.4825661  5.3860577  14.757  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 134.3 on 5482 degrees of freedom
## Multiple R-squared:  0.3322, Adjusted R-squared:  0.3317 
## F-statistic: 681.8 on 4 and 5482 DF,  p-value: < 2.2e-16
#Taking out new tests rate effect on new cases rate

residuals = residuals(fit)

new_covid <-
  covid %>% mutate("residuals" = residuals)

new_covid[2] = as.Date(new_covid$Day)

covid_usa <- new_covid %>% filter(location == "United States")

covid_canada <- new_covid %>% filter(location == "Canada")

covid_cote <- new_covid %>% filter(location ==  "Cote d'Ivoire") 
covid_ghana <- new_covid %>% filter(location ==  "Ghana") 
covid_guatemala <- new_covid %>% filter(location ==  "Guatemala") 
covid_india <- new_covid %>% filter(location ==  "India") 
covid_indonesia <- new_covid %>% filter(location ==  "Indonesia") 
covid_italy <- new_covid %>% filter(location ==  "Italy") 
covid_japan <- new_covid %>% filter(location ==  "Japan") 
covid_mexico <- new_covid %>% filter(location ==  "Mexico") 
covid_morocco <- new_covid %>% filter(location ==  "Morocco") 
covid_portugal <- new_covid %>% filter(location ==  "Portugal") 
covid_russia <- new_covid %>% filter(location ==  "Russia") 
covid_saudi <- new_covid %>% filter(location ==  "Saudi Arabia") 
covid_south_africa <- new_covid %>% filter(location ==  "South Africa") 
covid_uk <- new_covid %>% filter(location ==  "United Kingdom")
new_covid_usa <-
  covid_usa %>% mutate("residuals_diff" = residuals)

#deseasonalized
new_covid_usa$residuals_diff[8:409] = new_covid_usa$residuals_diff %>% diff(7)
plot.ts(new_covid_usa$residuals_diff)

# ARIMA as a special case of the ARIMAX intercept
regression_model = lm(residuals_diff~1, data = new_covid_usa[8:379,]) 
summary(regression_model)
## 
## Call:
## lm(formula = residuals_diff ~ 1, data = new_covid_usa[8:379, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -521.76  -29.09   -3.84   29.91  514.68 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    3.353      4.305   0.779    0.437
## 
## Residual standard error: 83.03 on 371 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: residuals_diff
##            Df  Sum Sq Mean Sq F value Pr(>F)
## Residuals 371 2557845  6894.5
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error,d=1)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(0,1,2) 
## 
## Coefficients:
##           ma1     ma2
##       -1.0055  0.3212
## s.e.   0.0504  0.0613
## 
## sigma^2 estimated as 5462:  log likelihood=-2122.31
## AIC=4250.62   AICc=4250.68   BIC=4262.37
## 
## Training set error measures:
##                      ME    RMSE      MAE      MPE    MAPE      MASE        ACF1
## Training set -0.2448158 73.6088 38.59669 3904.676 4149.71 0.7866605 0.003024844
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,2)
## Q* = 54.879, df = 8, p-value = 4.661e-09
## 
## Model df: 2.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_covid_usa[380:409,])
error_forecast =  predict(error_model, n.ahead = 30)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_covid_usa$residuals_diff[8:409])
colnames(compare_table) = c("predict","actual")
ggplot(data = new_covid_usa[8:409,]) + 
  geom_line(aes(x=Day, y=residuals_diff, color = "black")) + 
  geom_line(aes(x=Day, y=compare_table$predict, color = "red"))+ 
    labs(x = "Date",
         y = "USA Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(0,1,2)")

plot(new_covid_usa$Day[380:409],                              # Draw first time series
     new_covid_usa$residuals_diff[380:409],
     type = "l",
     xlab = "Date",
     ylab = "USA Prediction vs. Acutal: Null Model")
lines(new_covid_usa$Day[380:409],                             # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE USA Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 5418.256
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 2231.175
regression_model = lm(residuals_diff~as.factor(vaccination_policy), data = new_covid_usa[8:379,]) 

summary(regression_model)
## 
## Call:
## lm(formula = residuals_diff ~ as.factor(vaccination_policy), 
##     data = new_covid_usa[8:379, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -526.74  -28.64   -9.58   24.70  509.70 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      39.841      8.701   4.579 6.42e-06 ***
## as.factor(vaccination_policy)1  -78.497     12.630  -6.215 1.40e-09 ***
## as.factor(vaccination_policy)2  -41.438     16.686  -2.483  0.01346 *  
## as.factor(vaccination_policy)3  -38.953     19.747  -1.973  0.04929 *  
## as.factor(vaccination_policy)4  -61.935     21.645  -2.861  0.00446 ** 
## as.factor(vaccination_policy)5  -31.508     10.884  -2.895  0.00402 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 79.27 on 366 degrees of freedom
## Multiple R-squared:  0.1008, Adjusted R-squared:  0.08848 
## F-statistic: 8.203 on 5 and 366 DF,  p-value: 2.34e-07
anova(regression_model)
## Analysis of Variance Table
## 
## Response: residuals_diff
##                                Df  Sum Sq Mean Sq F value   Pr(>F)    
## as.factor(vaccination_policy)   5  257747   51549  8.2027 2.34e-07 ***
## Residuals                     366 2300098    6284                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error,d=1)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(0,1,2) 
## 
## Coefficients:
##           ma1     ma2
##       -0.9852  0.2860
## s.e.   0.0489  0.0637
## 
## sigma^2 estimated as 5544:  log likelihood=-2125.04
## AIC=4256.08   AICc=4256.14   BIC=4267.83
## 
## Training set error measures:
##                        ME     RMSE      MAE      MPE     MAPE      MASE
## Training set -0.004498275 74.15535 39.19593 51.63764 240.5241 0.7914197
##                      ACF1
## Training set -0.006956154
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,2)
## Q* = 58.988, df = 8, p-value = 7.358e-10
## 
## Model df: 2.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_covid_usa[380:409,])
error_forecast =  predict(error_model, n.ahead = 30)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_covid_usa$residuals_diff[8:409])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_covid_usa[8:409,]) + 
  geom_line(aes(x=Day, y=residuals_diff, color = "black")) + 
  geom_line(aes(x=Day, y=compare_table$predict, color = "red"))+ 
    labs(x = "Date",
         y = "USA Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(0,1,2)")

plot(new_covid_usa$Day[380:409],                              # Draw first time series
     new_covid_usa$residuals_diff[380:409],
     type = "l",
     xlab = "Date",
     ylab = "USA Prediction vs. Acutal: Full Model")
lines(new_covid_usa$Day[380:409],                             # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE USA Full
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 5499.016
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 2297.392
new_covid_canada <-
  covid_canada %>% mutate("residuals_diff" = residuals)

#deseasonalized
new_covid_canada$residuals_diff[8:409] = new_covid_canada$residuals_diff %>% diff(7)
plot.ts(new_covid_canada$residuals_diff)

# ARIMA as a special case of the ARIMAX intercept
regression_model = lm(residuals_diff~1, data = new_covid_canada[8:379,]) 
summary(regression_model)
## 
## Call:
## lm(formula = residuals_diff ~ 1, data = new_covid_canada[8:379, 
##     ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -163.827  -17.818   -0.067   11.820  211.984 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    1.355      1.908    0.71    0.478
## 
## Residual standard error: 36.81 on 371 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: residuals_diff
##            Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 371 502636  1354.8
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error,d=1)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,1,1) 
## 
## Coefficients:
##           ar1      ar2      ma1
##       -0.2607  -0.2065  -0.7614
## s.e.   0.0685   0.0638   0.0547
## 
## sigma^2 estimated as 1218:  log likelihood=-1843.73
## AIC=3695.46   AICc=3695.57   BIC=3711.13
## 
## Training set error measures:
##                       ME    RMSE      MAE       MPE     MAPE      MASE
## Training set -0.09491957 34.7177 20.38946 -957.0729 1330.579 0.7270513
##                      ACF1
## Training set -0.005374141
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,1)
## Q* = 72.669, df = 7, p-value = 4.26e-13
## 
## Model df: 3.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_covid_canada[380:409,])
error_forecast =  predict(error_model, n.ahead = 30)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_covid_canada$residuals_diff[8:409])
colnames(compare_table) = c("predict","actual")



ggplot(data = new_covid_canada[8:409,]) + 
  geom_line(aes(x=Day, y=residuals_diff, color = "black")) + 
  geom_line(aes(x=Day, y=compare_table$predict, color = "red"))+ 
    labs(x = "Date",
         y = "Canada Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(2,1,1)")

plot(new_covid_canada$Day[380:409],                              # Draw first time series
     new_covid_canada$residuals_diff[380:409],
     type = "l",
     xlab = "Date",
     ylab = "Canada Prediction vs. Acutal: Null Model")
lines(new_covid_canada$Day[380:409],                             # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Canada Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 1205.318
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 3531.228
# ARIMA as a special case of the ARIMAX intercept
regression_model = lm(residuals_diff~as.factor(vaccination_policy), data = new_covid_canada[8:379,]) 
summary(regression_model)
## 
## Call:
## lm(formula = residuals_diff ~ as.factor(vaccination_policy), 
##     data = new_covid_canada[8:379, ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -164.821  -15.648   -1.514   10.815  210.990 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      10.315      3.958   2.606  0.00952 ** 
## as.factor(vaccination_policy)2  -11.258      7.196  -1.565  0.11855    
## as.factor(vaccination_policy)3  -10.727      5.175  -2.073  0.03887 *  
## as.factor(vaccination_policy)4  -40.713      9.375  -4.343 1.82e-05 ***
## as.factor(vaccination_policy)5   -7.966      5.166  -1.542  0.12392    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36.06 on 367 degrees of freedom
## Multiple R-squared:  0.0507, Adjusted R-squared:  0.04036 
## F-statistic:   4.9 on 4 and 367 DF,  p-value: 0.0007352
anova(regression_model)
## Analysis of Variance Table
## 
## Response: residuals_diff
##                                Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(vaccination_policy)   4  25484  6371.1  4.9003 0.0007352 ***
## Residuals                     367 477152  1300.1                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error,d=1)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,1,1) 
## 
## Coefficients:
##           ar1      ar2      ma1
##       -0.2477  -0.1958  -0.7628
## s.e.   0.0720   0.0664   0.0600
## 
## sigma^2 estimated as 1230:  log likelihood=-1845.44
## AIC=3698.89   AICc=3699   BIC=3714.55
## 
## Training set error measures:
##                      ME     RMSE      MAE      MPE     MAPE      MASE
## Training set 0.03584671 34.87973 20.51357 26.17381 224.0617 0.7366782
##                      ACF1
## Training set -0.001888925
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,1)
## Q* = 73.989, df = 7, p-value = 2.3e-13
## 
## Model df: 3.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_covid_canada[380:409,])
error_forecast =  predict(error_model, n.ahead = 30)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_covid_canada$residuals_diff[8:409])
colnames(compare_table) = c("predict","actual")



ggplot(data = new_covid_canada[8:409,]) + 
  geom_line(aes(x=Day, y=residuals_diff, color = "black")) + 
  geom_line(aes(x=Day, y=compare_table$predict, color = "red"))+ 
    labs(x = "Date",
         y = "Canada Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(2,1,1)")

plot(new_covid_canada$Day[380:409],                              # Draw first time series
     new_covid_canada$residuals_diff[380:409],
     type = "l",
     xlab = "Date",
     ylab = "Canada Prediction vs. Acutal: Full Model")
lines(new_covid_canada$Day[380:409],                             # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Canada Full
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 1216.596
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 3529.257
#USA : Yt - Mt - St 
ts_covid_usa = ts(covid_usa$new_cases_per_million, frequency = 7)
#decompose(ts_covid_usa)

new_ts_covid_usa = ts_covid_usa - decompose(ts_covid_usa)$seasonal - decompose(ts_covid_usa)$trend

plot.ts(new_ts_covid_usa)

new_covid_usa <-
  new_covid_usa %>% mutate("new_ts_covid_usa" = new_ts_covid_usa) %>%
  na.omit()
regression_model = lm(new_ts_covid_usa~1, data = new_covid_usa[1:373,]) 
summary(regression_model)
## 
## Call:
## lm(formula = new_ts_covid_usa ~ 1, data = new_covid_usa[1:373, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -304.65  -28.81   -4.82   38.92  388.12 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.01713    4.26744  -0.004    0.997
## 
## Residual standard error: 82.42 on 372 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: new_ts_covid_usa
##            Df  Sum Sq Mean Sq F value Pr(>F)
## Residuals 372 2526888  6792.7
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(5,0,3) with zero mean 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5      ma1     ma2      ma3
##       -0.0456  -0.8651  -0.2450  -0.3038  -0.5619  -0.7014  0.5365  -0.6413
## s.e.   0.0536   0.0472   0.0651   0.0422   0.0466   0.0595  0.0508   0.0661
## 
## sigma^2 estimated as 2326:  log likelihood=-1973.94
## AIC=3965.89   AICc=3966.38   BIC=4001.18
## 
## Training set error measures:
##                   ME     RMSE      MAE       MPE     MAPE      MASE        ACF1
## Training set 1.04331 47.71312 31.00802 -49.67664 453.6815 0.4065845 -0.08202149
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,3) with zero mean
## Q* = 67.776, df = 3, p-value = 1.277e-14
## 
## Model df: 8.   Total lags used: 11
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_covid_usa[374:403,])
error_forecast =  predict(error_model, n.ahead = 30)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_covid_usa$new_ts_covid_usa[1:403])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_covid_usa[1:403,]) + 
  geom_line(aes(x=Day, y=new_ts_covid_usa, color = "black")) + 
  geom_line(aes(x=Day, y=compare_table$predict, color = "red"))+ 
    labs(x = "Date",
         y = "USA Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(5,0,3)")

plot(covid_usa$Day[374:403],                              # Draw first time series
     new_covid_usa$new_ts_covid_usa[374:403],
     type = "l",
     xlab = "Date",
     ylab = "USA Prediction vs. Acutal: Null Model")
lines(covid_usa$Day[374:403],                             # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE USA Null Method2
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 2276.541
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 4041.949
regression_model = lm(new_ts_covid_usa~as.factor(vaccination_policy), data = new_covid_usa[1:373,]) 
summary(regression_model)
## 
## Call:
## lm(formula = new_ts_covid_usa ~ as.factor(vaccination_policy), 
##     data = new_covid_usa[1:373, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -303.60  -28.50   -4.74   38.76  388.46 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)
## (Intercept)                      0.6476     8.8956   0.073    0.942
## as.factor(vaccination_policy)1  -1.7141    13.0738  -0.131    0.896
## as.factor(vaccination_policy)2   1.5087    17.3555   0.087    0.931
## as.factor(vaccination_policy)3  -0.5040    20.5756  -0.024    0.980
## as.factor(vaccination_policy)4  -0.7638    22.5702  -0.034    0.973
## as.factor(vaccination_policy)5  -0.9990    11.2668  -0.089    0.929
## 
## Residual standard error: 82.97 on 367 degrees of freedom
## Multiple R-squared:  0.0001125,  Adjusted R-squared:  -0.01351 
## F-statistic: 0.008257 on 5 and 367 DF,  p-value: 1
anova(regression_model)
## Analysis of Variance Table
## 
## Response: new_ts_covid_usa
##                                Df  Sum Sq Mean Sq F value Pr(>F)
## as.factor(vaccination_policy)   5     284    56.8  0.0083      1
## Residuals                     367 2526604  6884.5
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,0,2) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2
##       0.8430  -0.5297  -1.5722  0.6614
## s.e.  0.0632   0.0541   0.0508  0.0632
## 
## sigma^2 estimated as 3624:  log likelihood=-2057.01
## AIC=4124.02   AICc=4124.18   BIC=4143.63
## 
## Training set error measures:
##                     ME     RMSE      MAE      MPE     MAPE      MASE
## Training set 0.8497896 59.87576 39.16667 191.5565 368.7459 0.5136889
##                     ACF1
## Training set -0.03077804
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,2) with zero mean
## Q* = 162.52, df = 6, p-value < 2.2e-16
## 
## Model df: 4.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_covid_usa[374:403,])
error_forecast =  predict(error_model, n.ahead = 30)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_covid_usa$new_ts_covid_usa[1:403])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_covid_usa[1:403,]) + 
  geom_line(aes(x=Day, y=new_ts_covid_usa, color = "black")) + 
  geom_line(aes(x=Day, y=compare_table$predict, color = "red"))+ 
    labs(x = "Date",
         y = "USA Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(2,0,2)")

plot(covid_usa$Day[374:403],                              # Draw first time series
     new_covid_usa$new_ts_covid_usa[374:403],
     type = "l",
     xlab = "Date",
     ylab = "USA Prediction vs. Acutal: Full Model")
lines(covid_usa$Day[374:403],                             # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE USA Full Method 2
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 3585.106
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 7410.106
#Canada : Yt - Mt - St 
ts_covid_canada = ts(covid_canada$new_cases_per_million, frequency = 7)
#decompose(ts_covid_canada)

new_ts_covid_canada = ts_covid_canada - decompose(ts_covid_canada)$seasonal - decompose(ts_covid_canada)$trend

plot.ts(new_ts_covid_canada)

new_covid_canada <-
  new_covid_canada %>% mutate("new_ts_covid_canada" = new_ts_covid_canada) %>%
  na.omit()
regression_model = lm(new_ts_covid_canada~1, data = new_covid_canada[1:373,]) 
summary(regression_model)
## 
## Call:
## lm(formula = new_ts_covid_canada ~ 1, data = new_covid_canada[1:373, 
##     ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -116.663   -8.996   -0.421    8.972  159.511 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -0.2513     1.2566    -0.2    0.842
## 
## Residual standard error: 24.27 on 372 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: new_ts_covid_canada
##            Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 372 219109     589
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(5,0,1) with non-zero mean 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5      ma1    mean
##       -0.0057  -0.4892  -0.3502  -0.2449  -0.2394  -0.7730  0.2427
## s.e.   0.0730   0.0590   0.0656   0.0591   0.0640   0.0604  0.0897
## 
## sigma^2 estimated as 312.8:  log likelihood=-1598.8
## AIC=3213.59   AICc=3213.99   BIC=3244.96
## 
## Training set error measures:
##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.01631147 17.51833 11.42657 -8.796033 310.0721 0.5043037
##                     ACF1
## Training set -0.01265951
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,1) with non-zero mean
## Q* = 7.2743, df = 3, p-value = 0.06365
## 
## Model df: 7.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_covid_canada[374:403,])
error_forecast =  predict(error_model, n.ahead = 30)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_covid_canada$new_ts_covid_canada[1:403])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_covid_canada[1:403,]) + 
  geom_line(aes(x=Day, y=new_ts_covid_canada, color = "black")) + 
  geom_line(aes(x=Day, y=compare_table$predict, color = "red"))+ 
    labs(x = "Date",
         y = "Canada Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(5,0,1)")

plot(new_covid_canada$Day[374:403],                              # Draw first time series
     new_covid_canada$new_ts_covid_canada[374:403],
     type = "l",
     xlab = "Date",
     ylab = "Canada Prediction vs. Acutal: Null Model")
lines(new_covid_canada$Day[374:403],                             # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Canada Null Method 2
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 306.8921
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 1752.865
regression_model = lm(new_ts_covid_canada~as.factor(vaccination_policy), data = new_covid_canada[1:373,]) 
summary(regression_model)
## 
## Call:
## lm(formula = new_ts_covid_canada ~ as.factor(vaccination_policy), 
##     data = new_covid_canada[1:373, ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -116.340   -9.528   -0.078    9.255  160.184 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)
## (Intercept)                      0.2814     2.6154   0.108    0.914
## as.factor(vaccination_policy)2  -0.8559     4.8344  -0.177    0.860
## as.factor(vaccination_policy)3  -0.1180     3.4536  -0.034    0.973
## as.factor(vaccination_policy)4  -0.8518     6.3169  -0.135    0.893
## as.factor(vaccination_policy)5  -1.2065     3.4663  -0.348    0.728
## 
## Residual standard error: 24.4 on 368 degrees of freedom
## Multiple R-squared:  0.0004683,  Adjusted R-squared:  -0.0104 
## F-statistic: 0.0431 on 4 and 368 DF,  p-value: 0.9965
anova(regression_model)
## Analysis of Variance Table
## 
## Response: new_ts_covid_canada
##                                Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(vaccination_policy)   4    103   25.65  0.0431 0.9965
## Residuals                     368 219006  595.12
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(5,0,2) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ar5      ma1     ma2
##       0.4319  -0.6236  -0.2139  -0.1885  -0.2052  -1.1729  0.4948
## s.e.  0.1398   0.0638   0.0853   0.0624   0.0721   0.1338  0.1166
## 
## sigma^2 estimated as 322.3:  log likelihood=-1604.25
## AIC=3224.5   AICc=3224.89   BIC=3255.87
## 
## Training set error measures:
##                   ME     RMSE      MAE      MPE     MAPE      MASE         ACF1
## Training set 1.25696 17.78253 11.55249 189.9477 556.9271 0.5099318 -0.009867794
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,2) with zero mean
## Q* = 3.473, df = 3, p-value = 0.3243
## 
## Model df: 7.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_covid_canada[374:403,])
error_forecast =  predict(error_model, n.ahead = 30)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_covid_canada$new_ts_covid_canada[1:403])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_covid_canada[1:403,]) + 
  geom_line(aes(x=Day, y=new_ts_covid_canada, color = "black")) + 
  geom_line(aes(x=Day, y=compare_table$predict, color = "red"))+ 
    labs(x = "Date",
         y = "Canada Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(5,0,2)")

plot(new_covid_canada$Day[374:403],                              # Draw first time series
     new_covid_canada$new_ts_covid_canada[374:403],
     type = "l",
     xlab = "Date",
     ylab = "Canada Prediction vs. Acutal: Full Model")
lines(new_covid_canada$Day[374:403],                             # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Canada Full Method 2
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 316.2185
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 1772.211
#USA example aggregate weekly data
ts_covid_usa = ts(covid_usa$residuals,frequency = 7)
ts_covid_usa_2 = ts(colSums(matrix(ts_covid_usa, nrow=7))) 
## Warning in matrix(ts_covid_usa, nrow = 7): data length [409] is not a sub-
## multiple or multiple of the number of rows [7]
vaccine_policy = colSums(matrix(as.numeric(covid_usa$vaccination_policy), nrow=7))
## Warning in matrix(as.numeric(covid_usa$vaccination_policy), nrow = 7): data
## length [409] is not a sub-multiple or multiple of the number of rows [7]
new_ts_covid_usa = data.frame(1:59,ts_covid_usa_2,vaccine_policy)
names(new_ts_covid_usa) = c("week","weekly_aggregated_residuals","vaccination_policy")

plot.ts(ts_covid_usa_2)

regression_model = lm(weekly_aggregated_residuals~1, data = new_ts_covid_usa[1:53,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ 1, data = new_ts_covid_usa[1:53, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1526.0  -993.1  -603.0  1075.4  3009.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    820.5      174.5   4.702 1.94e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1270 on 52 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##           Df   Sum Sq Mean Sq F value Pr(>F)
## Residuals 52 83935233 1614139
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(1,1,0) 
## 
## Coefficients:
##          ar1
##       0.3295
## s.e.  0.1337
## 
## sigma^2 estimated as 140228:  log likelihood=-381.46
## AIC=766.93   AICc=767.17   BIC=770.83
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE      MASE       ACF1
## Training set 17.41164 367.3372 247.9875 -15.6555 40.55201 0.8408558 -0.0331298
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)
## Q* = 10.186, df = 9, p-value = 0.3356
## 
## Model df: 1.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_usa[54:58,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_usa$weekly_aggregated_residuals[1:58])
colnames(compare_table) = c("predict","actual")




ggplot(data = new_ts_covid_usa[1:58,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "USA Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(1,1,0)")

plot(new_ts_covid_usa$week[54:58],                              # Draw first time series
     new_ts_covid_usa$weekly_aggregated_residuals[54:58],
     type = "l",
     xlab = "Date",
     ylab = "USA Prediction vs. Acutal: Null Model")
lines(new_ts_covid_usa$week[54:58],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE USA Null Method 3
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 134936.6
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 418526.4
regression_model = lm(weekly_aggregated_residuals~vaccination_policy, data = new_ts_covid_usa[1:53,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ vaccination_policy, 
##     data = new_ts_covid_usa[1:53, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1607.8  -947.5  -544.2  1020.5  2658.9 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1616.21     322.63   5.010 6.92e-06 ***
## vaccination_policy   -31.78      11.11  -2.861   0.0061 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1191 on 51 degrees of freedom
## Multiple R-squared:  0.1383, Adjusted R-squared:  0.1214 
## F-statistic: 8.188 on 1 and 51 DF,  p-value: 0.006101
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##                    Df   Sum Sq  Mean Sq F value   Pr(>F)   
## vaccination_policy  1 11611275 11611275  8.1878 0.006101 **
## Residuals          51 72323958  1418117                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error


error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(5,0,0) with zero mean 
## 
## Coefficients:
##          ar1      ar2     ar3     ar4      ar5
##       1.0396  -0.0667  0.0437  0.2100  -0.4161
## s.e.  0.1295   0.2147  0.2321  0.2332   0.1417
## 
## sigma^2 estimated as 105262:  log likelihood=-380.99
## AIC=773.97   AICc=775.8   BIC=785.79
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -8.56323 308.7581 238.2628 0.6498531 37.30185 0.8149303
##                     ACF1
## Training set -0.02779966
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,0) with zero mean
## Q* = 1.6662, df = 5, p-value = 0.8931
## 
## Model df: 5.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_usa[54:58,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_usa$weekly_aggregated_residuals[1:58])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_usa[1:58,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "USA Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(5,0,0)")

plot(new_ts_covid_usa$week[54:58],                              # Draw first time series
     new_ts_covid_usa$weekly_aggregated_residuals[54:58],
     type = "l",
     xlab = "Date",
     ylab = "USA Prediction vs. Acutal: Full Model")
lines(new_ts_covid_usa$week[54:58],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE USA Full Method3
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 95331.56
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 67947.85
#Canada Example on aggregate weekly
ts_covid_canada = ts(covid_canada$residuals,frequency = 7)
ts_covid_canada_2 = ts(colSums(matrix(ts_covid_canada, nrow=7))) 
## Warning in matrix(ts_covid_canada, nrow = 7): data length [409] is not a sub-
## multiple or multiple of the number of rows [7]
vaccine_policy = colSums(matrix(as.numeric(covid_canada$vaccination_policy), nrow=7))
## Warning in matrix(as.numeric(covid_canada$vaccination_policy), nrow = 7): data
## length [409] is not a sub-multiple or multiple of the number of rows [7]
new_ts_covid_canada = data.frame(1:59,ts_covid_canada_2,vaccine_policy)
names(new_ts_covid_canada) = c("week","weekly_aggregated_residuals","vaccination_policy")

plot.ts(ts_covid_canada_2)

regression_model = lm(weekly_aggregated_residuals~1, data = new_ts_covid_canada[1:53,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ 1, data = new_ts_covid_canada[1:53, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -538.06 -319.27  -50.34  297.63  737.85 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -309.04      50.45  -6.126 1.22e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 367.3 on 52 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##           Df  Sum Sq Mean Sq F value Pr(>F)
## Residuals 52 7013442  134874
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,0,0) with zero mean 
## 
## Coefficients:
##          ar1      ar2
##       1.5911  -0.6923
## s.e.  0.0937   0.0944
## 
## sigma^2 estimated as 8017:  log likelihood=-314.13
## AIC=634.26   AICc=634.75   BIC=640.17
## 
## Training set error measures:
##                   ME     RMSE      MAE      MPE     MAPE      MASE        ACF1
## Training set 0.56964 87.83277 58.27775 8.972917 36.87626 0.6045546 -0.06785543
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0) with zero mean
## Q* = 5.3504, df = 8, p-value = 0.7195
## 
## Model df: 2.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_canada[54:58,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_canada$weekly_aggregated_residuals[1:58])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_canada[1:58,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Canada Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(2,0,0)")

plot(new_ts_covid_canada$week[54:58],                              # Draw first time series
     new_ts_covid_canada$weekly_aggregated_residuals[54:58],
     type = "l",
     xlab = "Date",
     ylab = "Canada Prediction vs. Acutal: Null Model")
lines(new_ts_covid_canada$week[54:58],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Canada Null method 3
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 7714.595
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 16810.94
regression_model = lm(weekly_aggregated_residuals~vaccination_policy, data = new_ts_covid_canada[1:53,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ vaccination_policy, 
##     data = new_ts_covid_canada[1:53, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -615.71 -274.08  -40.79  235.97  747.40 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        -110.149    112.006  -0.983   0.3300  
## vaccination_policy   -7.444      3.768  -1.976   0.0536 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 357.4 on 51 degrees of freedom
## Multiple R-squared:  0.07109,    Adjusted R-squared:  0.05288 
## F-statistic: 3.903 on 1 and 51 DF,  p-value: 0.05362
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##                    Df  Sum Sq Mean Sq F value  Pr(>F)  
## vaccination_policy  1  498583  498583   3.903 0.05362 .
## Residuals          51 6514859  127742                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,0,1) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1
##       1.7818  -0.8713  -0.4328
## s.e.  0.0923   0.0883   0.1981
## 
## sigma^2 estimated as 7966:  log likelihood=-313.48
## AIC=634.97   AICc=635.8   BIC=642.85
## 
## Training set error measures:
##                     ME     RMSE      MAE      MPE     MAPE      MASE
## Training set 0.3769429 86.69048 56.97042 231.7411 304.8481 0.5984639
##                      ACF1
## Training set 0.0006284805
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,1) with zero mean
## Q* = 2.4597, df = 7, p-value = 0.9301
## 
## Model df: 3.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_canada[54:58,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_canada$weekly_aggregated_residuals[1:58])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_canada[1:58,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Canada Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(2,0,1)")

plot(new_ts_covid_canada$week[54:58],                              # Draw first time series
     new_ts_covid_canada$weekly_aggregated_residuals[54:58],
     type = "l",
     xlab = "Date",
     ylab = "Canada Prediction vs. Acutal: Full Model")
lines(new_ts_covid_canada$week[54:58],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Canada Full Method 3
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 7515.24
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 8541.354
#Cote Example on aggregate weekly
ts_covid_cote = ts(covid_cote$residuals,frequency = 7)
ts_covid_cote_2 = ts(colSums(matrix(ts_covid_cote, nrow=7))) 
## Warning in matrix(ts_covid_cote, nrow = 7): data length [258] is not a sub-
## multiple or multiple of the number of rows [7]
vaccine_policy = colSums(matrix(as.numeric(covid_cote$vaccination_policy), nrow=7))
## Warning in matrix(as.numeric(covid_cote$vaccination_policy), nrow = 7): data
## length [258] is not a sub-multiple or multiple of the number of rows [7]
new_ts_covid_cote = data.frame(1:37,ts_covid_cote_2,vaccine_policy)
names(new_ts_covid_cote) = c("week","weekly_aggregated_residuals","vaccination_policy")

plot.ts(ts_covid_cote_2)

regression_model = lm(weekly_aggregated_residuals~1, data = new_ts_covid_cote[1:32,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ 1, data = new_ts_covid_cote[1:32, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.270 -21.875  -5.835   9.707  77.706 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -97.329      4.997  -19.48   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.27 on 31 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 31  24771  799.08
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(0,1,0) 
## 
## sigma^2 estimated as 403:  log likelihood=-136.97
## AIC=275.94   AICc=276.08   BIC=277.38
## 
## Training set error measures:
##                       ME     RMSE      MAE      MPE     MAPE      MASE
## Training set -0.07958716 19.75977 11.62652 22.98514 109.0915 0.9687882
##                     ACF1
## Training set -0.05123007
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)
## Q* = 8.6306, df = 6, p-value = 0.1954
## 
## Model df: 0.   Total lags used: 6
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_cote[33:37,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_cote$weekly_aggregated_residuals[1:37])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_cote[1:37,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Cote Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(0,1,0)")

plot(new_ts_covid_cote$week[33:37],                              # Draw first time series
     new_ts_covid_cote$weekly_aggregated_residuals[33:37],
     type = "l",
     xlab = "Date",
     ylab = "Cote Prediction vs. Acutal: Null Model")
lines(new_ts_covid_cote$week[33:37],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Cote Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 390.4484
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 849.6274
regression_model = lm(weekly_aggregated_residuals~vaccination_policy, data = new_ts_covid_cote[1:32,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ vaccination_policy, 
##     data = new_ts_covid_cote[1:32, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.866 -11.376  -8.633   9.711  65.812 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -62.7813     8.9283  -7.032 8.14e-08 ***
## vaccination_policy  -1.1327     0.2619  -4.325 0.000155 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.55 on 30 degrees of freedom
## Multiple R-squared:  0.384,  Adjusted R-squared:  0.3635 
## F-statistic:  18.7 on 1 and 30 DF,  p-value: 0.0001555
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##                    Df  Sum Sq Mean Sq F value    Pr(>F)    
## vaccination_policy  1  9512.3  9512.3  18.701 0.0001555 ***
## Residuals          30 15259.1   508.6                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(1,0,0) with zero mean 
## 
## Coefficients:
##          ar1
##       0.5846
## s.e.  0.1421
## 
## sigma^2 estimated as 322.1:  log likelihood=-137.5
## AIC=279.01   AICc=279.42   BIC=281.94
## 
## Training set error measures:
##                     ME     RMSE      MAE      MPE     MAPE      MASE       ACF1
## Training set 0.5298636 17.66441 10.70876 70.46478 73.72894 0.9174553 0.05083025
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0) with zero mean
## Q* = 5.675, df = 5, p-value = 0.3391
## 
## Model df: 1.   Total lags used: 6
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_cote[33:37,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_cote$weekly_aggregated_residuals[1:37])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_cote[1:37,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Cote Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(1,0,0)")

plot(new_ts_covid_cote$week[33:37],                              # Draw first time series
     new_ts_covid_cote$weekly_aggregated_residuals[33:37],
     type = "l",
     xlab = "Date",
     ylab = "Cote Prediction vs. Acutal: Full Model")
lines(new_ts_covid_cote$week[33:37],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Cote Full
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 312.0312
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 319.3056
#Ghana Example on aggregate weekly
ts_covid_ghana = ts(covid_ghana$residuals,frequency = 7)
ts_covid_ghana_2 = ts(colSums(matrix(ts_covid_ghana, nrow=7))) 
## Warning in matrix(ts_covid_ghana, nrow = 7): data length [208] is not a sub-
## multiple or multiple of the number of rows [7]
vaccine_policy = colSums(matrix(as.numeric(covid_ghana$vaccination_policy), nrow=7))
## Warning in matrix(as.numeric(covid_ghana$vaccination_policy), nrow = 7): data
## length [208] is not a sub-multiple or multiple of the number of rows [7]
new_ts_covid_ghana = data.frame(1:30,ts_covid_ghana_2,vaccine_policy)
names(new_ts_covid_ghana) = c("week","weekly_aggregated_residuals","vaccination_policy")

plot.ts(ts_covid_ghana_2)

regression_model = lm(weekly_aggregated_residuals~1, data = new_ts_covid_ghana[1:25,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ 1, data = new_ts_covid_ghana[1:25, 
##     ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -55.23 -38.87 -17.64  29.10 115.67 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -73.527      9.855  -7.461 1.06e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49.27 on 24 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 24  58267  2427.8
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(1,0,0) with zero mean 
## 
## Coefficients:
##          ar1
##       0.6933
## s.e.  0.1360
## 
## sigma^2 estimated as 1209:  log likelihood=-124.01
## AIC=252.02   AICc=252.56   BIC=254.45
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE      MASE        ACF1
## Training set 1.162571 34.06465 26.80162 63.60824 92.10038 0.9803189 -0.05002308
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0) with zero mean
## Q* = 4.0198, df = 4, p-value = 0.4033
## 
## Model df: 1.   Total lags used: 5
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_ghana[26:30,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_ghana$weekly_aggregated_residuals[1:30])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_ghana[1:30,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Ghana Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(1,0,0)")

plot(new_ts_covid_ghana$week[26:30],                              # Draw first time series
     new_ts_covid_ghana$weekly_aggregated_residuals[26:30],
     type = "l",
     xlab = "Date",
     ylab = "Ghana Prediction vs. Acutal: Null Model")
lines(new_ts_covid_ghana$week[26:30],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Ghana Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 1160.4
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 1152.268
regression_model = lm(weekly_aggregated_residuals~vaccination_policy, data = new_ts_covid_ghana[1:25,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ vaccination_policy, 
##     data = new_ts_covid_ghana[1:25, ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -56.78 -37.01 -12.13  37.07 107.15 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        -44.9807    19.6074  -2.294   0.0313 *
## vaccination_policy  -1.6219     0.9743  -1.665   0.1095  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.55 on 23 degrees of freedom
## Multiple R-squared:  0.1075, Adjusted R-squared:  0.06874 
## F-statistic: 2.772 on 1 and 23 DF,  p-value: 0.1095
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##                    Df Sum Sq Mean Sq F value Pr(>F)
## vaccination_policy  1   6266  6266.4  2.7716 0.1095
## Residuals          23  52001  2260.9
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(1,0,0) with zero mean 
## 
## Coefficients:
##         ar1
##       0.668
## s.e.  0.148
## 
## sigma^2 estimated as 1206:  log likelihood=-123.95
## AIC=251.9   AICc=252.44   BIC=254.34
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE      MASE        ACF1
## Training set 1.729828 34.02841 26.73081 248.3642 366.6443 0.9610988 -0.06799195
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0) with zero mean
## Q* = 3.2926, df = 4, p-value = 0.5101
## 
## Model df: 1.   Total lags used: 5
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_ghana[26:30,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_ghana$weekly_aggregated_residuals[1:30])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_ghana[1:30,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Ghana Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(1,0,0)")

plot(new_ts_covid_ghana$week[26:30],                              # Draw first time series
     new_ts_covid_ghana$weekly_aggregated_residuals[26:30],
     type = "l",
     xlab = "Date",
     ylab = "Ghana Prediction vs. Acutal: Full Model")
lines(new_ts_covid_ghana$week[26:30],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Ghana Full
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 1157.933
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 745.748
#Guatemala Example on aggregate weekly
ts_covid_guatemala = ts(covid_guatemala$residuals,frequency = 7)
ts_covid_guatemala_2 = ts(colSums(matrix(ts_covid_guatemala, nrow=7))) 
## Warning in matrix(ts_covid_guatemala, nrow = 7): data length [374] is not a sub-
## multiple or multiple of the number of rows [7]
vaccine_policy = colSums(matrix(as.numeric(covid_guatemala$vaccination_policy), nrow=7))
## Warning in matrix(as.numeric(covid_guatemala$vaccination_policy), nrow = 7):
## data length [374] is not a sub-multiple or multiple of the number of rows [7]
new_ts_covid_guatemala = data.frame(1:54,ts_covid_guatemala_2,vaccine_policy)
names(new_ts_covid_guatemala) = c("week","weekly_aggregated_residuals","vaccination_policy")

plot.ts(ts_covid_guatemala_2)

regression_model = lm(weekly_aggregated_residuals~1, data = new_ts_covid_guatemala[1:49,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ 1, data = new_ts_covid_guatemala[1:49, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -286.23 -257.64 -170.09   57.71  970.71 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -247.87      51.24  -4.837  1.4e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 358.7 on 48 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##           Df  Sum Sq Mean Sq F value Pr(>F)
## Residuals 48 6176129  128669
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(0,1,1) 
## 
## Coefficients:
##          ma1
##       0.3873
## s.e.  0.1313
## 
## sigma^2 estimated as 5577:  log likelihood=-274.72
## AIC=553.44   AICc=553.71   BIC=557.18
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE    MASE       ACF1
## Training set 12.37604 73.14104 49.42821 -11.06218 57.80992 0.90031 -0.0148057
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)
## Q* = 4.5724, df = 9, p-value = 0.8699
## 
## Model df: 1.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_guatemala[50:54,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_guatemala$weekly_aggregated_residuals[1:54])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_guatemala[1:54,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Guatemala Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(0,1,1)")

plot(new_ts_covid_guatemala$week[50:54],                              # Draw first time series
     new_ts_covid_guatemala$weekly_aggregated_residuals[50:54],
     type = "l",
     xlab = "Date",
     ylab = "Guatemala Prediction vs. Acutal: Null Model")
lines(new_ts_covid_guatemala$week[50:54],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Guaremala Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 5349.611
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 200715
regression_model = lm(weekly_aggregated_residuals~vaccination_policy, data = new_ts_covid_guatemala[1:49,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ vaccination_policy, 
##     data = new_ts_covid_guatemala[1:49, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -514.25  -47.85   40.39   65.28  524.37 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -753.607     58.086  -12.97  < 2e-16 ***
## vaccination_policy   27.202      2.702   10.07 2.59e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 204.1 on 47 degrees of freedom
## Multiple R-squared:  0.6831, Adjusted R-squared:  0.6764 
## F-statistic: 101.3 on 1 and 47 DF,  p-value: 2.589e-13
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##                    Df  Sum Sq Mean Sq F value    Pr(>F)    
## vaccination_policy  1 4219091 4219091  101.33 2.589e-13 ***
## Residuals          47 1957038   41639                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,0,0) with zero mean 
## 
## Coefficients:
##          ar1      ar2
##       1.1469  -0.3687
## s.e.  0.1300   0.1310
## 
## sigma^2 estimated as 10191:  log likelihood=-295.37
## AIC=596.75   AICc=597.28   BIC=602.42
## 
## Training set error measures:
##                     ME   RMSE      MAE       MPE     MAPE      MASE
## Training set 0.4284951 98.867 68.54279 -4.273555 118.4029 0.9044698
##                      ACF1
## Training set 0.0005101659
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0) with zero mean
## Q* = 5.8776, df = 8, p-value = 0.6609
## 
## Model df: 2.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_guatemala[50:54,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_guatemala$weekly_aggregated_residuals[1:54])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_guatemala[1:54,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Guatemala Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(2,0,0)")

plot(new_ts_covid_guatemala$week[50:54],                              # Draw first time series
     new_ts_covid_guatemala$weekly_aggregated_residuals[50:54],
     type = "l",
     xlab = "Date",
     ylab = "Guatemala Prediction vs. Acutal: Full Model")
lines(new_ts_covid_guatemala$week[50:54],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Guatemala Full
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 9774.683
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 56430.54
#India Example on aggregate weekly
ts_covid_india = ts(covid_india$residuals,frequency = 7)
ts_covid_india_2 = ts(colSums(matrix(ts_covid_india, nrow=7))) 
## Warning in matrix(ts_covid_india, nrow = 7): data length [362] is not a sub-
## multiple or multiple of the number of rows [7]
vaccine_policy = colSums(matrix(as.numeric(covid_india$vaccination_policy), nrow=7))
## Warning in matrix(as.numeric(covid_india$vaccination_policy), nrow = 7): data
## length [362] is not a sub-multiple or multiple of the number of rows [7]
new_ts_covid_india = data.frame(1:52,ts_covid_india_2,vaccine_policy)
names(new_ts_covid_india) = c("week","weekly_aggregated_residuals","vaccination_policy")

plot.ts(ts_covid_india_2)

regression_model = lm(weekly_aggregated_residuals~1, data = new_ts_covid_india[1:47,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ 1, data = new_ts_covid_india[1:47, 
##     ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -268.9 -245.8 -214.3 -101.3 1534.2 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   138.25      69.31   1.995    0.052 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 475.2 on 46 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##           Df   Sum Sq Mean Sq F value Pr(>F)
## Residuals 46 10387457  225814
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,0,0) with zero mean 
## 
## Coefficients:
##          ar1      ar2
##       1.7204  -0.8305
## s.e.  0.0697   0.0692
## 
## sigma^2 estimated as 6969:  log likelihood=-275.87
## AIC=557.74   AICc=558.3   BIC=563.29
## 
## Training set error measures:
##                     ME     RMSE     MAE     MPE     MAPE      MASE      ACF1
## Training set 0.1572028 81.68453 48.6424 -58.948 82.23303 0.5681809 0.2240344
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0) with zero mean
## Q* = 14.925, df = 7, p-value = 0.03697
## 
## Model df: 2.   Total lags used: 9
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_india[48:52,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_india$weekly_aggregated_residuals[1:52])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_india[1:52,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "India Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(2,0,0)")

plot(new_ts_covid_india$week[48:52],                              # Draw first time series
     new_ts_covid_india$weekly_aggregated_residuals[48:52],
     type = "l",
     xlab = "Date",
     ylab = "India Prediction vs. Acutal: Null Model")
lines(new_ts_covid_india$week[48:52],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE India Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 6672.362
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 34539.97
regression_model = lm(weekly_aggregated_residuals~vaccination_policy, data = new_ts_covid_india[1:47,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ vaccination_policy, 
##     data = new_ts_covid_india[1:47, ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -457.7 -322.2 -128.8   89.6 1414.9 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        -175.800    153.607  -1.144   0.2585  
## vaccination_policy   12.383      5.462   2.267   0.0282 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 455.2 on 45 degrees of freedom
## Multiple R-squared:  0.1025, Adjusted R-squared:  0.08257 
## F-statistic:  5.14 on 1 and 45 DF,  p-value: 0.02823
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##                    Df  Sum Sq Mean Sq F value  Pr(>F)  
## vaccination_policy  1 1064901 1064901  5.1403 0.02823 *
## Residuals          45 9322556  207168                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,0,2) with zero mean 
## 
## Coefficients:
##          ar1      ar2     ma1     ma2
##       1.5202  -0.6679  0.3237  0.3316
## s.e.  0.1507   0.1446  0.2118  0.1545
## 
## sigma^2 estimated as 7498:  log likelihood=-276.62
## AIC=563.23   AICc=564.7   BIC=572.48
## 
## Training set error measures:
##                      ME     RMSE      MAE      MPE     MAPE      MASE
## Training set -0.9512028 82.82397 58.35076 10.92665 37.01834 0.6452885
##                    ACF1
## Training set 0.02990413
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,2) with zero mean
## Q* = 4.4603, df = 5, p-value = 0.4852
## 
## Model df: 4.   Total lags used: 9
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_india[48:52,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_india$weekly_aggregated_residuals[1:52])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_india[1:52,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "India Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(2,0,2)")

plot(new_ts_covid_india$week[48:52],                              # Draw first time series
     new_ts_covid_india$weekly_aggregated_residuals[48:52],
     type = "l",
     xlab = "Date",
     ylab = "India Prediction vs. Acutal: Full Model")
lines(new_ts_covid_india$week[48:52],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE India Full
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 6859.81
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 99543.38
#Indonesia Example on aggregate weekly
ts_covid_indonesia = ts(covid_indonesia$residuals,frequency = 7)
ts_covid_indonesia_2 = ts(colSums(matrix(ts_covid_indonesia, nrow=7))) 
## Warning in matrix(ts_covid_indonesia, nrow = 7): data length [219] is not a sub-
## multiple or multiple of the number of rows [7]
vaccine_policy = colSums(matrix(as.numeric(covid_indonesia$vaccination_policy), nrow=7))
## Warning in matrix(as.numeric(covid_indonesia$vaccination_policy), nrow = 7):
## data length [219] is not a sub-multiple or multiple of the number of rows [7]
new_ts_covid_indonesia = data.frame(1:32,ts_covid_indonesia_2,vaccine_policy)
names(new_ts_covid_indonesia) = c("week","weekly_aggregated_residuals","vaccination_policy")

plot.ts(ts_covid_indonesia_2)

regression_model = lm(weekly_aggregated_residuals~1, data = new_ts_covid_indonesia[1:27,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ 1, data = new_ts_covid_indonesia[1:27, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -102.43  -65.99  -52.22   39.94  390.97 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    61.55      20.92   2.942  0.00677 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 108.7 on 26 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 26 307203   11816
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(1,0,2) with non-zero mean 
## 
## Coefficients:
##          ar1     ma1     ma2      mean
##       0.8774  0.8697  0.9756   75.1264
## s.e.  0.1239  0.4897  1.0765  134.9435
## 
## sigma^2 estimated as 1480:  log likelihood=-138.79
## AIC=287.59   AICc=290.45   BIC=294.07
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE      MASE      ACF1
## Training set 3.874938 35.50205 25.14409 22.28285 58.36999 0.6683383 0.1219358
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,2) with non-zero mean
## Q* = 1.8983, df = 3, p-value = 0.5938
## 
## Model df: 4.   Total lags used: 7
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_indonesia[28:32,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_indonesia$weekly_aggregated_residuals[1:32])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_indonesia[1:32,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Indonesia Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(2,0,2)")

plot(new_ts_covid_indonesia$week[28:32],                              # Draw first time series
     new_ts_covid_indonesia$weekly_aggregated_residuals[28:32],
     type = "l",
     xlab = "Date",
     ylab = "Indonesia Prediction vs. Acutal: Null Model")
lines(new_ts_covid_indonesia$week[28:32],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Indonesia Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 1260.395
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 129437.1
regression_model = lm(weekly_aggregated_residuals~vaccination_policy, data = new_ts_covid_indonesia[1:27,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ vaccination_policy, 
##     data = new_ts_covid_indonesia[1:27, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -113.82  -75.13  -39.24   38.97  370.13 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)
## (Intercept)         -16.050     56.956  -0.282    0.780
## vaccination_policy    4.687      3.210   1.460    0.157
## 
## Residual standard error: 106.4 on 25 degrees of freedom
## Multiple R-squared:  0.07858,    Adjusted R-squared:  0.04172 
## F-statistic: 2.132 on 1 and 25 DF,  p-value: 0.1567
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##                    Df Sum Sq Mean Sq F value Pr(>F)
## vaccination_policy  1  24140   24140   2.132 0.1567
## Residuals          25 283063   11322
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(0,0,2) with zero mean 
## 
## Coefficients:
##          ma1     ma2
##       1.1193  0.9546
## s.e.  0.2620  0.4744
## 
## sigma^2 estimated as 2936:  log likelihood=-147.39
## AIC=300.78   AICc=301.82   BIC=304.66
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE     MASE     ACF1
## Training set 5.115449 52.14034 36.28781 25.01186 61.62268 0.995775 0.361771
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,2) with zero mean
## Q* = 6.776, df = 3, p-value = 0.07939
## 
## Model df: 2.   Total lags used: 5
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_indonesia[28:32,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_indonesia$weekly_aggregated_residuals[1:32])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_indonesia[1:32,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Indonesia Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(2,0,1)")

plot(new_ts_covid_indonesia$week[28:32],                              # Draw first time series
     new_ts_covid_indonesia$weekly_aggregated_residuals[28:32],
     type = "l",
     xlab = "Date",
     ylab = "Indonesia Prediction vs. Acutal: Full Model")
lines(new_ts_covid_indonesia$week[28:32],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Indonesia Full
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 2718.615
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 223047.2
#Italy Example on aggregate weekly
ts_covid_italy = ts(covid_italy$residuals,frequency = 7)
ts_covid_italy_2 = ts(colSums(matrix(ts_covid_italy, nrow=7))) 
## Warning in matrix(ts_covid_italy, nrow = 7): data length [395] is not a sub-
## multiple or multiple of the number of rows [7]
vaccine_policy = colSums(matrix(as.numeric(covid_italy$vaccination_policy), nrow=7))
## Warning in matrix(as.numeric(covid_italy$vaccination_policy), nrow = 7): data
## length [395] is not a sub-multiple or multiple of the number of rows [7]
new_ts_covid_italy = data.frame(1:57,ts_covid_italy_2,vaccine_policy)
names(new_ts_covid_italy) = c("week","weekly_aggregated_residuals","vaccination_policy")

plot.ts(ts_covid_italy_2)

regression_model = lm(weekly_aggregated_residuals~1, data = new_ts_covid_italy[1:52,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ 1, data = new_ts_covid_italy[1:52, 
##     ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1181.19  -838.82   -59.01   633.30  2672.01 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    67.37     138.80   0.485    0.629
## 
## Residual standard error: 1001 on 51 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##           Df   Sum Sq Mean Sq F value Pr(>F)
## Residuals 51 51089085 1001747
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,1,1) 
## 
## Coefficients:
##          ar1      ar2      ma1
##       1.5750  -0.7912  -0.7692
## s.e.  0.1159   0.0932   0.1590
## 
## sigma^2 estimated as 42989:  log likelihood=-343.68
## AIC=695.37   AICc=696.24   BIC=703.1
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE   MAPE      MASE       ACF1
## Training set -13.91981 199.2044 139.7122 -15.69634 84.642 0.6064991 0.05854227
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,1)
## Q* = 6.9893, df = 7, p-value = 0.43
## 
## Model df: 3.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_italy[53:57,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_italy$weekly_aggregated_residuals[1:57])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_italy[1:57,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Italy Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(2,1,1)")

plot(new_ts_covid_italy$week[53:57],                              # Draw first time series
     new_ts_covid_italy$weekly_aggregated_residuals[53:57],
     type = "l",
     xlab = "Date",
     ylab = "Italy Prediction vs. Acutal: Null Model")
lines(new_ts_covid_italy$week[53:57],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Italy Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 39682.38
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 98947.38
regression_model = lm(weekly_aggregated_residuals~vaccination_policy, data = new_ts_covid_italy[1:52,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ vaccination_policy, 
##     data = new_ts_covid_italy[1:52, ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1901.06  -331.13   -38.52   350.53  1605.76 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1540.025    195.017   7.897  2.4e-10 ***
## vaccination_policy  -58.057      6.828  -8.503  2.8e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 646.3 on 50 degrees of freedom
## Multiple R-squared:  0.5912, Adjusted R-squared:  0.583 
## F-statistic: 72.29 on 1 and 50 DF,  p-value: 2.805e-11
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##                    Df   Sum Sq  Mean Sq F value    Pr(>F)    
## vaccination_policy  1 30201415 30201415  72.295 2.805e-11 ***
## Residuals          50 20887669   417753                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,0,0) with zero mean 
## 
## Coefficients:
##          ar1      ar2
##       1.5292  -0.8207
## s.e.  0.0813   0.0850
## 
## sigma^2 estimated as 47627:  log likelihood=-354.55
## AIC=715.09   AICc=715.59   BIC=720.94
## 
## Training set error measures:
##                      ME     RMSE      MAE     MPE     MAPE      MASE
## Training set -0.2823539 213.9979 154.6673 -537.84 698.1204 0.6331228
##                     ACF1
## Training set -0.08598986
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0) with zero mean
## Q* = 5.0949, df = 8, p-value = 0.7474
## 
## Model df: 2.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_italy[53:57,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_italy$weekly_aggregated_residuals[1:57])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_italy[1:57,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Italy Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(2,0,0)")

plot(new_ts_covid_italy$week[53:57],                              # Draw first time series
     new_ts_covid_italy$weekly_aggregated_residuals[53:57],
     type = "l",
     xlab = "Date",
     ylab = "Italy Prediction vs. Acutal: Full Model")
lines(new_ts_covid_italy$week[53:57],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Italy Full
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 45795.1
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 493516.2
#Japan Example on aggregate weekly
ts_covid_japan = ts(covid_japan$residuals,frequency = 7)
ts_covid_japan_2 = ts(colSums(matrix(ts_covid_japan, nrow=7))) 

vaccine_policy = colSums(matrix(as.numeric(covid_japan$vaccination_policy), nrow=7))
new_ts_covid_japan = data.frame(1:50,ts_covid_japan_2,vaccine_policy)
names(new_ts_covid_japan) = c("week","weekly_aggregated_residuals","vaccination_policy")

plot.ts(ts_covid_japan_2)

regression_model = lm(weekly_aggregated_residuals~1, data = new_ts_covid_japan[1:45,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ 1, data = new_ts_covid_japan[1:45, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -206.53 -158.08  -98.11    7.20  894.92 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    96.09      40.11   2.396   0.0209 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 269 on 44 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##           Df  Sum Sq Mean Sq F value Pr(>F)
## Residuals 44 3184844   72383
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,1,1) 
## 
## Coefficients:
##          ar1      ar2      ma1
##       1.5856  -0.8366  -0.7681
## s.e.  0.1005   0.0792   0.1809
## 
## sigma^2 estimated as 4095:  log likelihood=-244.91
## AIC=497.82   AICc=498.84   BIC=504.95
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE      MASE       ACF1
## Training set 8.743547 61.07848 36.10528 160.3919 226.4182 0.5201579 0.09833099
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,1)
## Q* = 3.7162, df = 6, p-value = 0.715
## 
## Model df: 3.   Total lags used: 9
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_japan[46:50,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_japan$weekly_aggregated_residuals[1:50])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_japan[1:50,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Japan Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(2,1,1)")

plot(new_ts_covid_japan$week[46:50],                              # Draw first time series
     new_ts_covid_japan$weekly_aggregated_residuals[46:50],
     type = "l",
     xlab = "Date",
     ylab = "Japan Prediction vs. Acutal: Null Model")
lines(new_ts_covid_japan$week[46:50],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Japan Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 3730.581
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 147076.2
regression_model = lm(weekly_aggregated_residuals~vaccination_policy, data = new_ts_covid_japan[1:45,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ vaccination_policy, 
##     data = new_ts_covid_japan[1:45, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -362.40 -125.99  -29.33   57.61  738.43 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        -121.423     72.188  -1.682  0.09981 . 
## vaccination_policy   10.686      3.078   3.472  0.00119 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 240.5 on 43 degrees of freedom
## Multiple R-squared:  0.2189, Adjusted R-squared:  0.2008 
## F-statistic: 12.05 on 1 and 43 DF,  p-value: 0.001191
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##                    Df  Sum Sq Mean Sq F value   Pr(>F)   
## vaccination_policy  1  697232  697232  12.052 0.001191 **
## Residuals          43 2487612   57851                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,0,0) with zero mean 
## 
## Coefficients:
##          ar1      ar2
##       1.6107  -0.8406
## s.e.  0.0741   0.0727
## 
## sigma^2 estimated as 3944:  log likelihood=-251.08
## AIC=508.16   AICc=508.74   BIC=513.58
## 
## Training set error measures:
##                     ME    RMSE      MAE      MPE     MAPE     MASE       ACF1
## Training set -1.962991 61.3877 39.73173 28.29837 45.90808 0.527788 0.09894474
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0) with zero mean
## Q* = 4.3985, df = 7, p-value = 0.7329
## 
## Model df: 2.   Total lags used: 9
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_japan[46:50,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_japan$weekly_aggregated_residuals[1:50])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_japan[1:50,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Japan Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(2,0,0)")

plot(new_ts_covid_japan$week[46:50],                              # Draw first time series
     new_ts_covid_japan$weekly_aggregated_residuals[46:50],
     type = "l",
     xlab = "Date",
     ylab = "Japan Prediction vs. Acutal: Full Model")
lines(new_ts_covid_japan$week[46:50],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Japan Full
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 3768.449
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 113560.9
#Mexico Example on aggregate weekly
ts_covid_mexico = ts(covid_mexico$residuals,frequency = 7)
ts_covid_mexico_2 = ts(colSums(matrix(ts_covid_mexico, nrow=7))) 
## Warning in matrix(ts_covid_mexico, nrow = 7): data length [401] is not a sub-
## multiple or multiple of the number of rows [7]
vaccine_policy = colSums(matrix(as.numeric(covid_mexico$vaccination_policy), nrow=7))
## Warning in matrix(as.numeric(covid_mexico$vaccination_policy), nrow = 7): data
## length [401] is not a sub-multiple or multiple of the number of rows [7]
new_ts_covid_mexico = data.frame(1:58,ts_covid_mexico_2,vaccine_policy)
names(new_ts_covid_mexico) = c("week","weekly_aggregated_residuals","vaccination_policy")

plot.ts(ts_covid_mexico_2)

regression_model = lm(weekly_aggregated_residuals~1, data = new_ts_covid_mexico[1:53,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ 1, data = new_ts_covid_mexico[1:53, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -300.90 -191.13  -68.37  126.46  545.70 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -274.58      32.93  -8.338 3.73e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 239.7 on 52 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##           Df  Sum Sq Mean Sq F value Pr(>F)
## Residuals 52 2988441   57470
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(1,0,0) with zero mean 
## 
## Coefficients:
##          ar1
##       0.8794
## s.e.  0.0589
## 
## sigma^2 estimated as 11890:  log likelihood=-324.1
## AIC=652.21   AICc=652.45   BIC=656.15
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE     MASE       ACF1
## Training set 1.576358 108.0099 79.89582 86.89919 149.0427 1.004145 0.06268612
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0) with zero mean
## Q* = 9.3113, df = 9, p-value = 0.4091
## 
## Model df: 1.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_mexico[54:58,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_mexico$weekly_aggregated_residuals[1:58])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_mexico[1:58,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Mexico Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(1,0,0)")

plot(new_ts_covid_mexico$week[54:58],                              # Draw first time series
     new_ts_covid_mexico$weekly_aggregated_residuals[54:58],
     type = "l",
     xlab = "Date",
     ylab = "Mexico Prediction vs. Acutal: Null Model")
lines(new_ts_covid_mexico$week[54:58],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Mexico Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 11666.14
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 29676.86
regression_model = lm(weekly_aggregated_residuals~vaccination_policy, data = new_ts_covid_mexico[1:53,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ vaccination_policy, 
##     data = new_ts_covid_mexico[1:53, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -331.85 -183.81  -38.24  165.89  545.70 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -396.699     66.218  -5.991 2.12e-07 ***
## vaccination_policy    5.467      2.598   2.104   0.0403 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 232.2 on 51 degrees of freedom
## Multiple R-squared:  0.0799, Adjusted R-squared:  0.06185 
## F-statistic: 4.428 on 1 and 51 DF,  p-value: 0.04029
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##                    Df  Sum Sq Mean Sq F value  Pr(>F)  
## vaccination_policy  1  238762  238762  4.4285 0.04029 *
## Residuals          51 2749679   53915                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(1,0,0) with zero mean 
## 
## Coefficients:
##          ar1
##       0.8691
## s.e.  0.0615
## 
## sigma^2 estimated as 11834:  log likelihood=-323.94
## AIC=651.88   AICc=652.12   BIC=655.82
## 
## Training set error measures:
##                     ME     RMSE      MAE      MPE    MAPE     MASE       ACF1
## Training set -1.042131 107.7525 80.94973 74.20563 236.201 1.010714 0.05946579
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0) with zero mean
## Q* = 7.9807, df = 9, p-value = 0.5361
## 
## Model df: 1.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_mexico[54:58,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_mexico$weekly_aggregated_residuals[1:58])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_mexico[1:58,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Mexico Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(1,0,0)")

plot(new_ts_covid_mexico$week[54:58],                              # Draw first time series
     new_ts_covid_mexico$weekly_aggregated_residuals[54:58],
     type = "l",
     xlab = "Date",
     ylab = "Mexico Prediction vs. Acutal: Full Model")
lines(new_ts_covid_mexico$week[54:58],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Mexico Full
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 11610.6
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 34994.13
#Morocco Example on aggregate weekly
ts_covid_morocco = ts(covid_morocco$residuals,frequency = 7)
ts_covid_morocco_2 = ts(colSums(matrix(ts_covid_morocco, nrow=7))) 
## Warning in matrix(ts_covid_morocco, nrow = 7): data length [337] is not a sub-
## multiple or multiple of the number of rows [7]
vaccine_policy = colSums(matrix(as.numeric(covid_morocco$vaccination_policy), nrow=7))
## Warning in matrix(as.numeric(covid_morocco$vaccination_policy), nrow = 7): data
## length [337] is not a sub-multiple or multiple of the number of rows [7]
new_ts_covid_morocco = data.frame(1:49,ts_covid_morocco_2,vaccine_policy)
names(new_ts_covid_morocco) = c("week","weekly_aggregated_residuals","vaccination_policy")

plot.ts(ts_covid_morocco_2)

regression_model = lm(weekly_aggregated_residuals~1, data = new_ts_covid_morocco[1:44,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ 1, data = new_ts_covid_morocco[1:44, 
##     ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -333.5 -305.8 -225.6  196.7 1303.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   225.32      65.22   3.455  0.00125 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 432.6 on 43 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##           Df  Sum Sq Mean Sq F value Pr(>F)
## Residuals 43 8048484  187174
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(1,0,2) with zero mean 
## 
## Coefficients:
##          ar1     ma1     ma2
##       0.7349  1.0149  0.5319
## s.e.  0.1147  0.1967  0.1703
## 
## sigma^2 estimated as 15781:  log likelihood=-275.26
## AIC=558.53   AICc=559.55   BIC=565.66
## 
## Training set error measures:
##                     ME     RMSE      MAE      MPE     MAPE      MASE
## Training set -1.740112 121.2659 62.09287 26.98198 47.22166 0.6566894
##                     ACF1
## Training set -0.02084803
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,2) with zero mean
## Q* = 4.1937, df = 6, p-value = 0.6505
## 
## Model df: 3.   Total lags used: 9
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_morocco[45:49,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_morocco$weekly_aggregated_residuals[1:49])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_morocco[1:49,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Morocco Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(1,0,2)")

plot(new_ts_covid_morocco$week[45:49],                              # Draw first time series
     new_ts_covid_morocco$weekly_aggregated_residuals[45:49],
     type = "l",
     xlab = "Date",
     ylab = "Morocco Prediction vs. Acutal: Null Model")
lines(new_ts_covid_morocco$week[45:49],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Morocco Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 14705.42
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 64940.89
regression_model = lm(weekly_aggregated_residuals~vaccination_policy, data = new_ts_covid_morocco[1:44,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ vaccination_policy, 
##     data = new_ts_covid_morocco[1:44, ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -343.2 -306.9 -221.8  187.1 1294.2 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)
## (Intercept)         200.738    155.054   1.295    0.203
## vaccination_policy    1.224      6.984   0.175    0.862
## 
## Residual standard error: 437.6 on 42 degrees of freedom
## Multiple R-squared:  0.0007302,  Adjusted R-squared:  -0.02306 
## F-statistic: 0.03069 on 1 and 42 DF,  p-value: 0.8618
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##                    Df  Sum Sq Mean Sq F value Pr(>F)
## vaccination_policy  1    5877    5877  0.0307 0.8618
## Residuals          42 8042607  191491
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(1,0,2) with zero mean 
## 
## Coefficients:
##          ar1     ma1     ma2
##       0.7345  1.0162  0.5379
## s.e.  0.1140  0.1919  0.1677
## 
## sigma^2 estimated as 15727:  log likelihood=-275.2
## AIC=558.39   AICc=559.42   BIC=565.53
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -1.961342 121.0585 62.39035 -1.124799 42.34422 0.6570673
##                     ACF1
## Training set -0.01884444
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,2) with zero mean
## Q* = 3.9824, df = 6, p-value = 0.6791
## 
## Model df: 3.   Total lags used: 9
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_morocco[45:49,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_morocco$weekly_aggregated_residuals[1:49])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_morocco[1:49,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Morocco Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(1,0,2)")

plot(new_ts_covid_morocco$week[45:49],                              # Draw first time series
     new_ts_covid_morocco$weekly_aggregated_residuals[45:49],
     type = "l",
     xlab = "Date",
     ylab = "Morocco Prediction vs. Acutal: Full Model")
lines(new_ts_covid_morocco$week[45:49],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Morocco Full
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 14655.16
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 69560.69
#Portugal Example on aggregate weekly
ts_covid_portugal = ts(covid_portugal$residuals,frequency = 7)
ts_covid_portugal_2 = ts(colSums(matrix(ts_covid_portugal, nrow=7))) 
## Warning in matrix(ts_covid_portugal, nrow = 7): data length [397] is not a sub-
## multiple or multiple of the number of rows [7]
vaccine_policy = colSums(matrix(as.numeric(covid_portugal$vaccination_policy), nrow=7))
## Warning in matrix(as.numeric(covid_portugal$vaccination_policy), nrow = 7): data
## length [397] is not a sub-multiple or multiple of the number of rows [7]
new_ts_covid_portugal = data.frame(1:57,ts_covid_portugal_2,vaccine_policy)
names(new_ts_covid_portugal) = c("week","weekly_aggregated_residuals","vaccination_policy")

plot.ts(ts_covid_portugal_2)

regression_model = lm(weekly_aggregated_residuals~1, data = new_ts_covid_portugal[1:52,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ 1, data = new_ts_covid_portugal[1:52, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1962.3 -1357.8  -502.9   735.9  6327.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    484.8      264.3   1.835   0.0724 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1906 on 51 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##           Df    Sum Sq Mean Sq F value Pr(>F)
## Residuals 51 185211019 3631589
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(0,1,1) 
## 
## Coefficients:
##          ma1
##       0.6961
## s.e.  0.1046
## 
## sigma^2 estimated as 438577:  log likelihood=-403.47
## AIC=810.94   AICc=811.19   BIC=814.8
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE     MASE       ACF1
## Training set -5.741977 649.3911 408.5719 -28.33632 84.46924 0.858771 0.05847121
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)
## Q* = 11.852, df = 9, p-value = 0.2218
## 
## Model df: 1.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_portugal[53:57,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_portugal$weekly_aggregated_residuals[1:57])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_portugal[1:57,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Portugal Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(0,1,1)")

plot(new_ts_covid_portugal$week[53:57],                              # Draw first time series
     new_ts_covid_portugal$weekly_aggregated_residuals[53:57],
     type = "l",
     xlab = "Date",
     ylab = "Portugal Prediction vs. Acutal: Null Model")
lines(new_ts_covid_portugal$week[53:57],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Portugual Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 421708.9
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 31015.46
regression_model = lm(weekly_aggregated_residuals~vaccination_policy, data = new_ts_covid_portugal[1:52,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ vaccination_policy, 
##     data = new_ts_covid_portugal[1:52, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2384.6  -949.8  -431.6   416.2  5975.8 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2339.15     557.95   4.192 0.000112 ***
## vaccination_policy   -71.53      19.49  -3.671 0.000588 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1708 on 50 degrees of freedom
## Multiple R-squared:  0.2123, Adjusted R-squared:  0.1965 
## F-statistic: 13.47 on 1 and 50 DF,  p-value: 0.0005879
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##                    Df    Sum Sq  Mean Sq F value    Pr(>F)    
## vaccination_policy  1  39315862 39315862  13.474 0.0005879 ***
## Residuals          50 145895157  2917903                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,0,0) with zero mean 
## 
## Coefficients:
##          ar1      ar2
##       1.4329  -0.6293
## s.e.  0.1035   0.1043
## 
## sigma^2 estimated as 389096:  log likelihood=-408.67
## AIC=823.35   AICc=823.85   BIC=829.2
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE     MASE       ACF1
## Training set 1.549144 611.6622 364.5818 -12.97327 64.07895 0.742551 0.02914117
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0) with zero mean
## Q* = 13.85, df = 8, p-value = 0.08577
## 
## Model df: 2.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_portugal[53:57,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_portugal$weekly_aggregated_residuals[1:57])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_portugal[1:57,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Portugal Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(2,0,0)")

plot(new_ts_covid_portugal$week[53:57],                              # Draw first time series
     new_ts_covid_portugal$weekly_aggregated_residuals[53:57],
     type = "l",
     xlab = "Date",
     ylab = "Portugal Prediction vs. Acutal: Full Model")
lines(new_ts_covid_portugal$week[53:57],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Portugual Full
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 374130.7
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 142490.8
#Russia Example on aggregate weekly
ts_covid_russia = ts(covid_russia$residuals,frequency = 7)
ts_covid_russia_2 = ts(colSums(matrix(ts_covid_russia, nrow=7))) 
## Warning in matrix(ts_covid_russia, nrow = 7): data length [248] is not a sub-
## multiple or multiple of the number of rows [7]
vaccine_policy = colSums(matrix(as.numeric(covid_russia$vaccination_policy), nrow=7))
## Warning in matrix(as.numeric(covid_russia$vaccination_policy), nrow = 7): data
## length [248] is not a sub-multiple or multiple of the number of rows [7]
new_ts_covid_russia = data.frame(1:36,ts_covid_russia_2,vaccine_policy)
names(new_ts_covid_russia) = c("week","weekly_aggregated_residuals","vaccination_policy")

plot.ts(ts_covid_russia_2)

regression_model = lm(weekly_aggregated_residuals~1, data = new_ts_covid_russia[1:31,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ 1, data = new_ts_covid_russia[1:31, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -298.18 -252.58  -98.95  229.25  536.28 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -394.32      50.45  -7.816 1.01e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 280.9 on 30 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##           Df  Sum Sq Mean Sq F value Pr(>F)
## Residuals 30 2367105   78904
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(1,1,0) 
## 
## Coefficients:
##          ar1
##       0.6921
## s.e.  0.1436
## 
## sigma^2 estimated as 3745:  log likelihood=-165.81
## AIC=335.62   AICc=336.06   BIC=338.42
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 4.446286 59.19288 46.38008 -60.96461 81.01134 0.7137882 -0.1062188
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)
## Q* = 7.5889, df = 5, p-value = 0.1804
## 
## Model df: 1.   Total lags used: 6
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_russia[32:36,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_portugal$weekly_aggregated_residuals[1:36])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_russia[1:36,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Russia Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(1,1,0)")

plot(new_ts_covid_russia$week[32:36],                              # Draw first time series
     new_ts_covid_russia$weekly_aggregated_residuals[32:36],
     type = "l",
     xlab = "Date",
     ylab = "Russia Prediction vs. Acutal: Null Model")
lines(new_ts_covid_russia$week[32:36],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Russia Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 6554158
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 676377
regression_model = lm(weekly_aggregated_residuals~vaccination_policy, data = new_ts_covid_russia[1:31,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ vaccination_policy, 
##     data = new_ts_covid_russia[1:31, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -441.83  -86.89  -27.87  167.53  318.34 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -85.952     67.210  -1.279    0.211    
## vaccination_policy  -12.918      2.375  -5.439 7.48e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 201 on 29 degrees of freedom
## Multiple R-squared:  0.505,  Adjusted R-squared:  0.4879 
## F-statistic: 29.59 on 1 and 29 DF,  p-value: 7.483e-06
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##                    Df  Sum Sq Mean Sq F value    Pr(>F)    
## vaccination_policy  1 1195422 1195422  29.587 7.483e-06 ***
## Residuals          29 1171684   40403                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,0,1) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1
##       1.8773  -0.9499  -0.8101
## s.e.  0.0532   0.0475   0.1599
## 
## sigma^2 estimated as 4223:  log likelihood=-173.7
## AIC=355.39   AICc=356.93   BIC=361.13
## 
## Training set error measures:
##                    ME     RMSE     MAE       MPE     MAPE      MASE     ACF1
## Training set 4.363854 61.75662 48.8188 -13.63794 69.21278 0.7729344 -0.14973
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,1) with zero mean
## Q* = 5.7662, df = 3, p-value = 0.1236
## 
## Model df: 3.   Total lags used: 6
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_russia[32:36,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_portugal$weekly_aggregated_residuals[1:36])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_russia[1:36,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Russia Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(2,0,1)")

plot(new_ts_covid_russia$week[32:36],                              # Draw first time series
     new_ts_covid_russia$weekly_aggregated_residuals[32:36],
     type = "l",
     xlab = "Date",
     ylab = "Russia Prediction vs. Acutal: Full Model")
lines(new_ts_covid_russia$week[32:36],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Russia Full
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 6568545
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 436139.8
#Saudi Example on aggregate weekly
ts_covid_saudi = ts(covid_saudi$residuals,frequency = 7)
ts_covid_saudi_2 = ts(colSums(matrix(ts_covid_saudi, nrow=7))) 
## Warning in matrix(ts_covid_saudi, nrow = 7): data length [403] is not a sub-
## multiple or multiple of the number of rows [7]
vaccine_policy = colSums(matrix(as.numeric(covid_saudi$vaccination_policy), nrow=7))
## Warning in matrix(as.numeric(covid_saudi$vaccination_policy), nrow = 7): data
## length [403] is not a sub-multiple or multiple of the number of rows [7]
new_ts_covid_saudi = data.frame(1:58,ts_covid_saudi_2,vaccine_policy)
names(new_ts_covid_saudi) = c("week","weekly_aggregated_residuals","vaccination_policy")

plot.ts(ts_covid_saudi_2)

regression_model = lm(weekly_aggregated_residuals~1, data = new_ts_covid_saudi[1:53,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ 1, data = new_ts_covid_saudi[1:53, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -166.55  -21.28    2.99   22.05   90.64 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -233.321      6.229  -37.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.35 on 52 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 52 106929  2056.3
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(0,1,3) 
## 
## Coefficients:
##           ma1      ma2     ma3
##       -0.2827  -0.4785  0.2656
## s.e.   0.1348   0.1521  0.1309
## 
## sigma^2 estimated as 1286:  log likelihood=-258.75
## AIC=525.5   AICc=526.35   BIC=533.3
## 
## Training set error measures:
##                     ME    RMSE      MAE       MPE     MAPE      MASE
## Training set -3.336452 34.4829 25.19053 -21.89949 189.2627 0.9423321
##                     ACF1
## Training set -0.03222228
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,3)
## Q* = 9.723, df = 7, p-value = 0.2048
## 
## Model df: 3.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_saudi[54:58,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_saudi$weekly_aggregated_residuals[1:58])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_saudi[1:58,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Saudi Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(0,1,3)")

plot(new_ts_covid_saudi$week[54:58],                              # Draw first time series
     new_ts_covid_saudi$weekly_aggregated_residuals[54:58],
     type = "l",
     xlab = "Date",
     ylab = "Saudi Prediction vs. Acutal: Null Model")
lines(new_ts_covid_saudi$week[54:58],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Saudi Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 1189.071
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 788.4918
regression_model = lm(weekly_aggregated_residuals~vaccination_policy, data = new_ts_covid_saudi[1:53,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ vaccination_policy, 
##     data = new_ts_covid_saudi[1:53, ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -164.978  -23.462    2.543   23.060   92.212 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -228.7081    15.1013 -15.145   <2e-16 ***
## vaccination_policy   -0.1472     0.4382  -0.336    0.738    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.74 on 51 degrees of freedom
## Multiple R-squared:  0.002207,   Adjusted R-squared:  -0.01736 
## F-statistic: 0.1128 on 1 and 51 DF,  p-value: 0.7383
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##                    Df Sum Sq Mean Sq F value Pr(>F)
## vaccination_policy  1    236  236.05  0.1128 0.7383
## Residuals          51 106693 2092.03
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(0,0,1) with zero mean 
## 
## Coefficients:
##          ma1
##       0.6956
## s.e.  0.1116
## 
## sigma^2 estimated as 1340:  log likelihood=-265.83
## AIC=535.67   AICc=535.91   BIC=539.61
## 
## Training set error measures:
##                      ME     RMSE      MAE      MPE     MAPE      MASE
## Training set -0.3146459 36.25454 25.51332 42.58368 210.6015 0.9520885
##                    ACF1
## Training set 0.03685257
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1) with zero mean
## Q* = 15.487, df = 9, p-value = 0.0784
## 
## Model df: 1.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_saudi[54:58,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_saudi$weekly_aggregated_residuals[1:58])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_saudi[1:58,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Saudi Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(0,0,1)")

plot(new_ts_covid_saudi$week[54:58],                              # Draw first time series
     new_ts_covid_saudi$weekly_aggregated_residuals[54:58],
     type = "l",
     xlab = "Date",
     ylab = "Saudi Prediction vs. Acutal: Full Model")
lines(new_ts_covid_saudi$week[54:58],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE Saudi Full
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 1314.392
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 2212.317
#Uganda Example on aggregate weekly
covid_uganda <- readRDS("data/covid_north_weekly.RData")
 covid_uganda <-
   covid_uganda %>% filter(location == "Uganda") %>%
   mutate("week" = 1:43)
regression_model = lm(adjusted_weekly_cases_per_million~1, data = covid_uganda[1:38,]) 
summary(regression_model)
## 
## Call:
## lm(formula = adjusted_weekly_cases_per_million ~ 1, data = covid_uganda[1:38, 
##     ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -38.42 -34.87 -17.96  13.24 150.40 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -86.698      7.691  -11.27 1.59e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.41 on 37 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: adjusted_weekly_cases_per_million
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 37  83172  2247.9
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,0,0) with zero mean 
## 
## Coefficients:
##          ar1      ar2
##       1.2996  -0.5073
## s.e.  0.1343   0.1335
## 
## sigma^2 estimated as 412.8:  log likelihood=-168.3
## AIC=342.61   AICc=343.32   BIC=347.52
## 
## Training set error measures:
##                      ME     RMSE      MAE       MPE    MAPE      MASE
## Training set -0.1984356 19.77471 12.94145 -14.11046 79.2795 0.8859306
##                     ACF1
## Training set 0.008854984
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0) with zero mean
## Q* = 1.9946, df = 6, p-value = 0.9202
## 
## Model df: 2.   Total lags used: 8
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = covid_uganda[39:43,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),covid_uganda$adjusted_weekly_cases_per_million[1:43])
colnames(compare_table) = c("predict","actual")


ggplot(data = covid_uganda[1:43,]) + 
  geom_line(aes(x=week, y=adjusted_weekly_cases_per_million, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Uganda Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(2,0,0)")

plot(covid_uganda$week[39:43],                              # Draw first time series
     covid_uganda$adjusted_weekly_cases_per_million[39:43],
     type = "l",
     xlab = "Date",
     ylab = "Uganda Prediction vs. Acutal: Null Model")
lines(covid_uganda$week[39:43],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE South Africa Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 391.0393
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 614.8072
regression_model = lm(adjusted_weekly_cases_per_million~as.numeric(vaccination_policy), data = covid_uganda[1:38,]) 
summary(regression_model)
## 
## Call:
## lm(formula = adjusted_weekly_cases_per_million ~ as.numeric(vaccination_policy), 
##     data = covid_uganda[1:38, ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -46.48 -30.68 -15.23  22.22 131.14 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -119.722     15.733  -7.610  5.3e-09 ***
## as.numeric(vaccination_policy)   13.072      5.526   2.365   0.0235 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.72 on 36 degrees of freedom
## Multiple R-squared:  0.1345, Adjusted R-squared:  0.1105 
## F-statistic: 5.596 on 1 and 36 DF,  p-value: 0.02351
anova(regression_model)
## Analysis of Variance Table
## 
## Response: adjusted_weekly_cases_per_million
##                                Df Sum Sq Mean Sq F value  Pr(>F)  
## as.numeric(vaccination_policy)  1  11188 11188.5  5.5955 0.02351 *
## Residuals                      36  71983  1999.5                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,0,0) with zero mean 
## 
## Coefficients:
##          ar1      ar2
##       1.2597  -0.4583
## s.e.  0.1403   0.1415
## 
## sigma^2 estimated as 393.1:  log likelihood=-167.32
## AIC=340.64   AICc=341.35   BIC=345.56
## 
## Training set error measures:
##                     ME     RMSE      MAE      MPE     MAPE      MASE
## Training set -0.523593 19.29831 12.53067 236.7103 253.1082 0.8790713
##                     ACF1
## Training set -0.01965143
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0) with zero mean
## Q* = 2.0956, df = 6, p-value = 0.9107
## 
## Model df: 2.   Total lags used: 8
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = covid_uganda[39:43,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),covid_uganda$adjusted_weekly_cases_per_million[1:43])
colnames(compare_table) = c("predict","actual")


ggplot(data = covid_uganda[1:43,]) + 
  geom_line(aes(x=week, y=adjusted_weekly_cases_per_million, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "Uganda Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(2,0,0)")

plot(covid_uganda$week[39:43],                              # Draw first time series
     covid_uganda$adjusted_weekly_cases_per_million[39:43],
     type = "l",
     xlab = "Date",
     ylab = "Uganda Prediction vs. Acutal: Null Model")
lines(covid_uganda$week[39:43],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE South Africa Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 372.425
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 2288.198
#United Kingdom Example on aggregate weekly
ts_covid_uk = ts(covid_uk$residuals,frequency = 7)
ts_covid_uk_2 = ts(colSums(matrix(ts_covid_uk, nrow=7))) 
## Warning in matrix(ts_covid_uk, nrow = 7): data length [415] is not a sub-
## multiple or multiple of the number of rows [7]
vaccine_policy = colSums(matrix(as.numeric(covid_uk$vaccination_policy), nrow=7))
## Warning in matrix(as.numeric(covid_uk$vaccination_policy), nrow = 7): data
## length [415] is not a sub-multiple or multiple of the number of rows [7]
new_ts_covid_uk = data.frame(1:60,ts_covid_uk_2,vaccine_policy)
names(new_ts_covid_uk) = c("week","weekly_aggregated_residuals","vaccination_policy")

plot.ts(ts_covid_uk_2)

regression_model = lm(weekly_aggregated_residuals~1, data = new_ts_covid_uk[1:55,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ 1, data = new_ts_covid_uk[1:55, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2579.4 -1632.6   376.3  1036.9  4272.8 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -211.2      225.2  -0.938    0.352
## 
## Residual standard error: 1670 on 54 degrees of freedom
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##           Df    Sum Sq Mean Sq F value Pr(>F)
## Residuals 54 150595053 2788797
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,0,0) with zero mean 
## 
## Coefficients:
##          ar1      ar2
##       1.4156  -0.5145
## s.e.  0.1123   0.1121
## 
## sigma^2 estimated as 248034:  log likelihood=-419.95
## AIC=845.9   AICc=846.37   BIC=851.92
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 13.37828 488.8913 351.8551 -65.33358 112.3605 0.8099452 0.03082382
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0) with zero mean
## Q* = 8.4026, df = 8, p-value = 0.3952
## 
## Model df: 2.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_uk[56:60,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_uk$weekly_aggregated_residuals[1:60])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_uk[1:60,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "UK Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Null Model, ARIMA(2,0,0)")

plot(new_ts_covid_uk$week[56:60],                              # Draw first time series
     new_ts_covid_uk$weekly_aggregated_residuals[56:60],
     type = "l",
     xlab = "Date",
     ylab = "United Kingdom Prediction vs. Acutal: Null Model")
lines(new_ts_covid_uk$week[56:60],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE UK Null
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 239014.7
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 790416
regression_model = lm(weekly_aggregated_residuals~vaccination_policy, data = new_ts_covid_uk[1:55,]) 
summary(regression_model)
## 
## Call:
## lm(formula = weekly_aggregated_residuals ~ vaccination_policy, 
##     data = new_ts_covid_uk[1:55, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2587.1 -1511.6    52.3  1008.9  4265.1 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)
## (Intercept)          310.95     547.88   0.568    0.573
## vaccination_policy   -18.37      17.58  -1.045    0.301
## 
## Residual standard error: 1669 on 53 degrees of freedom
## Multiple R-squared:  0.0202, Adjusted R-squared:  0.001709 
## F-statistic: 1.092 on 1 and 53 DF,  p-value: 0.3007
anova(regression_model)
## Analysis of Variance Table
## 
## Response: weekly_aggregated_residuals
##                    Df    Sum Sq Mean Sq F value Pr(>F)
## vaccination_policy  1   3041426 3041426  1.0925 0.3007
## Residuals          53 147553627 2784031
error = residuals(regression_model)
ts_error = ts(error)

#ARIMA on error

error_model = auto.arima(ts_error)
fitted_error = fitted(error_model)
summary(error_model)
## Series: ts_error 
## ARIMA(2,0,0) with zero mean 
## 
## Coefficients:
##          ar1      ar2
##       1.4245  -0.5277
## s.e.  0.1113   0.1115
## 
## sigma^2 estimated as 252383:  log likelihood=-420.43
## AIC=846.86   AICc=847.33   BIC=852.89
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE      MASE       ACF1
## Training set 16.17417 493.1582 359.2741 4.718285 59.38791 0.8055654 0.02330247
checkresiduals(error_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0) with zero mean
## Q* = 8.123, df = 8, p-value = 0.4215
## 
## Model df: 2.   Total lags used: 10
#goodness of fit
regression_prediction = predict(regression_model)

final_prediction = regression_prediction + fitted_error

regression_prediction = predict(regression_model, newdata = new_ts_covid_uk[56:60,])
error_forecast =  predict(error_model, n.ahead = 5)$pred

final_prediction_2 = regression_prediction + error_forecast


#compare prediction and actual


compare_table = data.frame(c(final_prediction,final_prediction_2),new_ts_covid_uk$weekly_aggregated_residuals[1:60])
colnames(compare_table) = c("predict","actual")


ggplot(data = new_ts_covid_uk[1:60,]) + 
  geom_line(aes(x=week, y=weekly_aggregated_residuals, color = "black")) + 
  geom_line(aes(x=week, y=compare_table$predict, color = "red"))+ 
    labs(x = "Week",
         y = "UK Prediction vs. Acutal")+
  scale_color_manual(name = c("color"),values = c("black","red"),labels = c("Actual","Prediction"))+
  ggtitle("Full Model, ARIMA(2,0,0)")

plot(new_ts_covid_uk$week[56:60],                              # Draw first time series
     new_ts_covid_uk$weekly_aggregated_residuals[56:60],
     type = "l",
     xlab = "Date",
     ylab = "United Kingdom Prediction vs. Acutal: Full Model")
lines(new_ts_covid_uk$week[56:60],                            # Draw second time series
      final_prediction_2,
      type = "l",
      col = "red")
legend(x = "topright",legend=c("actual", "prediction"),
       col=c("black", "red"), lty=1, cex=0.5)

#Training MSE UK Full
SSE_train = sum((final_prediction - compare_table$actual[1:length(final_prediction)])^2)
MSE_train = SSE_train/length(final_prediction)
MSE_train
## [1] 243205
#Testing MSE
SSE_test = sum((final_prediction_2 - compare_table$actual[length(final_prediction)+1:length(final_prediction_2)])^2)
MSE_test = SSE_test/length(final_prediction_2)
MSE_test
## [1] 1020226